sample_annotation <- left_join(analysis, clinical_annotation, by = "SampleID")
sample_annotation <- left_join(sample_annotation, HMW_ratio, by = "SampleID")
sample_annotation <- sample_annotation %>% mutate(ClassificationDetail = ifelse((sample_annotation$CorrectClassification == FALSE) & (sample_annotation$ClassificationResult == "Normal"), "Inconclusive", ifelse((sample_annotation$CorrectClassification == FALSE) & (sample_annotation$ClassificationResult != "Normal"), "Incorrect tumor", "Correct")))
sample_annotation <- sample_annotation %>% mutate(disease_extent = ifelse(sample_annotation$Metastatic_status == "M+", "Metastatic", "Localized"))
sample_annotation <- sample_annotation %>% mutate(cfRRBS_sWGS_CNA = paste0(cfRRBS_CNA,"-", sWGS_CNA))
sample_annotation$cfDNA_HMW_ratio_binned = cut(sample_annotation$cfDNA_HMW_ratio, c(-Inf, 1, 5, Inf), labels=c("High", "Medium", "Low"))
sample_annotation$DNA_input_binned = cut(sample_annotation$DNA_input, c(-Inf, 5, 10, Inf), labels=c("< 5 ng", "5 - 10 ng", "> 10 ng"))
sample_annotation$ClassificationDetail <- factor(sample_annotation$ClassificationDetail, levels=c("Incorrect tumor", "Correct", "Inconclusive"))
sample_annotation$labelmisclas <- ifelse(sample_annotation$ClassificationDetail == "Incorrect tumor", sample_annotation$ClassificationResult, "")
datatable(sample_annotation, rownames = TRUE, filter="top", options = list(pageLength = 5, scrollX=T), caption = "Overview of the sample annotation (analysis QC + sequencing QC + clinical data")[1] "Overall"
[1] "The number of correct classifications: 49"
[1] "The number of misclassifications: 11"
[1] "The number of total samples: 60"
[1] "The percentage of correct classifications: 81.67%"
[1] "CNV profiles"
[1] "The number of correct classifications with flat CNV profile: 9"
[1] "The number of misclassifications with flat CNV profile: 9"
[1] "The number of total samples with flat CNV profile: 18"
[1] "The percentage of correct classifications with flat CNV profile: 50%"
[1] "Higher than 10%"
[1] "The number of correct classifications with >= 10% eTFx: 36"
[1] "The number of misclassifications with >= 10% eTFx: 0"
[1] "The number of total samples with >= 10% eTFx: 36"
[1] "The percentage of correct classifications with >= 10% eTFx: 100%"
[1] "Lower than 10%"
[1] "The number of correct classifications with < 10% eTFx: 13"
[1] "The number of misclassifications with < 10% eTFx: 11"
[1] "The number of total samples with < 10% eTFx: 24"
[1] "The percentage of correct classifications with < 10% eTFx: 54.17%"
[1] "Lower than 1%"
[1] "The number of correct classifications with < 1% eTFx: 3"
[1] "The number of misclassifications with < 1% eTFx: -6"
[1] "The number of total samples with < 1% eTFx: 7"
[1] "The percentage of correct classifications with < 1% eTFx: 42.86%"
sample_annotation$CorrectClassification <- as.character(sample_annotation$CorrectClassification)
sample_annotation$CorrectClassification <- ifelse(sample_annotation$CorrectClassification == "TRUE", "Correct", "Incorrect")
ggplot(sample_annotation, aes(x = M_Seqs , y = GraphTumor, col = ClassificationDetail)) +
geom_beeswarm(color = "black", cex = 2, size = 2, groupOnX = FALSE) + geom_beeswarm(cex = 2, size = 1.5, groupOnX = FALSE) + theme_point + labs(y = "Tumor type", x = "# reads (M)", col = "Classification call", title = "Number of reads after adaptor trimming") + xlim(0, 40) +theme(panel.grid.major.y=element_line(linetype = "dashed",color="gray88"), axis.text.y=element_text(size=8))ggplot(sample_annotation, aes(x = Prct_mCpG, y = GraphTumor, col = ClassificationDetail)) +
geom_beeswarm(color = "black", cex = 2, size = 2, groupOnX = FALSE) + geom_beeswarm(cex = 2, size = 1.5, groupOnX = FALSE) + theme_point + xlim(0,100) +labs(y = "Tumor type", x = "mCpG (%)", col = "Classification call", title = "Genome-wide methylation of CpGs") + theme(panel.grid.major.y=element_line(linetype = "dashed",color="gray88"), axis.text.y=element_text(size=8))ggplot(sample_annotation, aes(x = Prct_OnBaitBases, y = GraphTumor, col = ClassificationDetail)) +
geom_beeswarm(color = "black", cex = 2, size = 2, groupOnX = FALSE) + geom_beeswarm(cex = 2, size = 1.5, groupOnX = FALSE) + theme_point + xlim(0,100) + labs(y = "Tumor type", x = "On bait bases (%)", col = "Classification call", title = "Percentage of all reads mapping to \n MspI fragments between 20 and 200 bp") + theme(panel.grid.major.y=element_line(linetype = "dashed",color="gray88"), axis.text.y=element_text(size=8))ggplot(sample_annotation, aes(x = Prct_Aligned, y = GraphTumor, col = ClassificationDetail)) +
geom_beeswarm(color = "black", cex = 2, size = 2, groupOnX = FALSE) + geom_beeswarm(cex = 2, size = 1.5, groupOnX = FALSE) + theme_point + xlim(0,100) + labs(title = "Reads mapping uniquely to GRCh37 (with Bismark)", y = "Tumor type", x = "Aligned (%)", col = "Classification call") + theme(panel.grid.major.y=element_line(linetype = "dashed",color="gray88"), axis.text.y=element_text(size=8))ggplot(sample_annotation, aes(x = 100 - Prct_Lambda_mCpG, y = GraphTumor, col = ClassificationDetail)) +
geom_beeswarm(color = "black", cex = 2, size = 2, groupOnX = FALSE) + geom_beeswarm(cex = 2, size = 1.5, groupOnX = FALSE) + theme_point + xlim(92,100) + labs(title = "Bisulfite conversion efficiency", y = "Tumor type", x = "Bisulfite conversion efficiency (%)", col = "Classification call") + theme(panel.grid.major.y=element_line(linetype = "dashed",color="gray88"), axis.text.y=element_text(size=8))
Wilcoxon-Mann-Whitney test with continuity correction (confidence
interval requires proportional odds assumption, but test does
not)
data: scRRBS_comp$Prct_OnBaitBases by scRRBS_comp$Protocol
Mann-Whitney estimate = 0, tie factor = 0.99991, p-value <
0.00000000000000022
alternative hypothesis: two distributions are not equal
95 percent confidence interval:
0.00000000 0.03095146
sample estimates:
Mann-Whitney estimate
0
[1] "cfRRBS % on bait:"
[1] "scRRBS % on bait:"
class_results <- ggplot(sample_annotation, aes(x = reorder(CorrectClassification, desc(CorrectClassification)), y = Estimated_TFx*100, col = ClassificationDetail)) +
geom_beeswarm(color = "black", cex = 2, size = 2) + geom_beeswarm(cex = 2, size = 1.5) + theme_point + stat_compare_means(label = "p.format", comparisons = list(c("Correct", "Incorrect")), tip.length = 0,bracket.size = 0.5) +
labs(y = "Estimated TFx (%)", x = "Classfication", col = "Classification call")
class_results## [1] "Summary of correctly classified tumors:"
## Estimated_TFx
## Min. :0.0010
## 1st Qu.:0.0880
## Median :0.3090
## Mean :0.3459
## 3rd Qu.:0.5480
## Max. :1.0000
## [1] "Summary of incorrectly classified tumors:"
## Estimated_TFx
## Min. :0.00000
## 1st Qu.:0.00000
## Median :0.02100
## Mean :0.02427
## 3rd Qu.:0.03700
## Max. :0.07600
ggplot(sample_annotation, aes(x = reorder(Center, desc(Center)), y = Estimated_TFx*100, col = ClassificationDetail)) +
geom_beeswarm(color = "black", cex = 2, size = 2) + geom_beeswarm(cex = 2, size = 1.5) + theme_point +
labs(y = "Estimated TFx (%)", x = "Center", col = "Classification call", title = "Misclassifications by center")ggplot(sample_annotation, aes(x = reorder(Center, desc(Center)), y = cfDNA_HMW_ratio, col = ClassificationDetail)) +
geom_beeswarm(color = "black", cex = 2, size = 2) + geom_beeswarm(cex = 2, size = 1.5) + theme_point +
labs(y = "cfDNA/HMW ratio", x = "Center", col = "Classification call")##
## Wilcoxon-Mann-Whitney test with continuity correction (confidence
## interval requires proportional odds assumption, but test does
## not)
##
## data: sample_annotation$Estimated_TFx by sample_annotation$disease_extent
## Mann-Whitney estimate = 0.5651, tie factor = 0.99947, p-value =
## 0.3928
## alternative hypothesis: two distributions are not equal
## 95 percent confidence interval:
## 0.4188994 0.6990860
## sample estimates:
## Mann-Whitney estimate
## 0.5650954
dz_extent <- ggplot(sample_annotation, aes(x = reorder(disease_extent, desc(disease_extent)), y = Estimated_TFx*100, col = ClassificationDetail)) +
geom_beeswarm(color = "black", cex = 2, size = 2) + geom_beeswarm(cex = 2, size = 1.5) + theme_point + stat_compare_means(label = "p.format", comparisons = list(c(1,2)), tip.length = 0, bracket.size = 0.5) + labs(y = "Estimated TFx (%)", x = "Disease extent", col = "Classification call")
dz_extent## [1] "Summary of correctly classified tumors:"
## Estimated_TFx
## Min. :0.0000
## 1st Qu.:0.0300
## Median :0.3090
## Mean :0.3345
## 3rd Qu.:0.5450
## Max. :1.0000
## [1] "Summary of incorrectly classified tumors:"
## Estimated_TFx
## Min. :0.000
## 1st Qu.:0.032
## Median :0.166
## Mean :0.248
## 3rd Qu.:0.518
## Max. :0.787
sample_annotation$GraphTumor.ordered <- factor(sample_annotation$GraphTumor, levels=c("NBL", "aRMS", "eRMS", "OS", "EWS", "WT", "CCSK", "MRT", "MB", "ATRT"))
ggplot(sample_annotation, aes(x = reorder(GraphTumor.ordered, desc(GraphTumor.ordered)), y = Estimated_TFx*100, col = ClassificationDetail, label = labelmisclas, shape = reorder(Biomaterial, desc(Biomaterial)))) +
geom_beeswarm(color = "black", cex = 2, size = 2) + geom_beeswarm(cex = 2, size = 1.5) + theme_point + geom_text_repel(size=3) +
theme(panel.grid.major.y=element_line(linetype = "dashed",color="gray88"), axis.text.y=element_text(size=8)) +
labs(y = "Estimated TFx (%)", x = "Tumor type", col = "Classification call", shape = "Biomaterial") +
coord_flip()fulldeconv <- fulldeconv %>% filter((tumor != "normal")) %>% filter((tumor != "wbc"))
fulldeconv_annot <- left_join(x = sample_annotation, y = fulldeconv, by = c("SampleID" = "SampleID"))
datatable(fulldeconv, rownames = TRUE, filter="top", options = list(pageLength = 5, scrollX=T), caption = "The detected fractions for every sample")ggplot(fulldeconv_annot %>% filter(Estimated_TFx < 0.10), aes(x = tumor, y = fraction, fill = ClassificationDetail)) +
geom_bar(stat="identity") + facet_wrap(~ SampleID + GraphTumor, scales = "free")+ theme(axis.text.x = element_text(angle = 90, hjust = 1)) + labs(x = "tumor entity", y = "detected fraction", title = "Sample deconvolution for estimated TFx less than 10%")ggplot(fulldeconv_annot %>% filter((Estimated_TFx > 0.10) & (Estimated_TFx < 30)), aes(x = tumor, y = fraction, fill = ClassificationDetail)) +
geom_bar(stat="identity") + facet_wrap(~ SampleID + GraphTumor, scales = "free")+ theme(axis.text.x = element_text(angle = 90, hjust = 1)) + labs(x = "tumor entity", y = "detected fraction", title = "Sample deconvolution for estimated TFx between 10 and 30%")ggplot(fulldeconv_annot %>% filter((Estimated_TFx > 0.30)), aes(x = tumor, y = fraction, fill = ClassificationDetail)) +
geom_bar(stat="identity") + facet_wrap(~ SampleID + GraphTumor, scales = "free")+ theme(axis.text.x = element_text(angle = 90, hjust = 1)) + labs(x = "tumor entity", y = "detected fraction", title = "Sample deconvolution for estimated TFx higher than 30%")ratio_cont <- ggplot(sample_annotation, aes(y = Estimated_TFx*100, x = cfDNA_HMW_ratio, col = ClassificationDetail, shape = reorder(Biomaterial, desc(Biomaterial)))) + scale_x_continuous(trans = 'log10') +
geom_point(color = "black", size = 2) + geom_point(size = 1.5) + theme_point +
labs(y = "Estimated TFx (%)", x = "cfDNA-HMW ratio (log)",col = "Classification call", shape = "Biomaterial") + geom_vline(xintercept=1, colour="grey", linetype = "dashed") +geom_vline(xintercept=5, colour="grey", linetype = "dashed")+
annotate("text", x = 1, y = 90, size = 3.5,label="High HMW\n", colour="gray", angle=-270) +
annotate("text", x = 1, y = 90, size = 3.5,label = "\nMedium HMW", colour="gray", angle=90) +
annotate("text", x = 5, y = 90, size = 3.5,label="Medium HMW\n", colour="gray", angle=-270) +
annotate("text", x = 5, y = 90, size = 3.5,label="\nLow HMW", colour="gray", angle=90)
ratio_cont ratio_binned <- ggplot(sample_annotation, aes(y = Estimated_TFx*100, x = cfDNA_HMW_ratio_binned, col = ClassificationDetail, shape = reorder(Biomaterial, desc(Biomaterial)))) +
geom_beeswarm(color = "black", cex = 2, size = 2) + geom_beeswarm(cex = 2, size = 1.5) + theme_point +
labs(y = "Estimated TFx (%)", x = "HMW contamination",col = "Classification call", shape = "Biomaterial")
ratio_binned
print(paste0("Classification accuracy in the low HMW group: ", sample_annotation %>% filter(cfDNA_HMW_ratio_binned == "Low" & ClassificationDetail == "Correct") %>% nrow, "/", sample_annotation %>% filter(cfDNA_HMW_ratio_binned == "Low") %>% nrow))
[1] "Classification accuracy in the low HMW group: 35/37"
print(paste0("Classification accuracy in the medium HMW group: ", sample_annotation %>% filter(cfDNA_HMW_ratio_binned == "Medium" & ClassificationDetail == "Correct") %>% nrow, "/", sample_annotation %>% filter(cfDNA_HMW_ratio_binned == "Medium") %>% nrow))
[1] "Classification accuracy in the medium HMW group: 7/14"
print(paste0("Classification accuracy in the high HMW group: ", sample_annotation %>% filter(cfDNA_HMW_ratio_binned == "High" & ClassificationDetail == "Correct") %>% nrow, "/", sample_annotation %>% filter(cfDNA_HMW_ratio_binned == "High") %>% nrow))
[1] "Classification accuracy in the high HMW group: 7/9"
print(paste0("Classification accuracy in the medium + high HMW group: ", sample_annotation %>% filter((cfDNA_HMW_ratio_binned == "Medium" | cfDNA_HMW_ratio_binned == "High" ) & ClassificationDetail == "Correct") %>% nrow, "/", sample_annotation %>% filter((cfDNA_HMW_ratio_binned == "Medium" | cfDNA_HMW_ratio_binned == "High" )) %>% nrow))
[1] "Classification accuracy in the medium + high HMW group: 14/23"
print("Samples with discordant CNV profiles")
[1] "Samples with discordant CNV profiles"
sample_annotation_CNAFlat <- sample_annotation %>% filter(cfRRBS_sWGS_CNA == "Flat-CNA")
print(paste0("Low HMW: ", sample_annotation_CNAFlat %>% filter(cfDNA_HMW_ratio_binned == "Low") %>% nrow()))
[1] "Low HMW: 4"
print(paste0("High HMW: ", sample_annotation_CNAFlat %>% filter(cfDNA_HMW_ratio_binned == "High") %>% nrow()))
[1] "High HMW: 5"
print(paste0("Medium HMW: ", sample_annotation_CNAFlat %>% filter(cfDNA_HMW_ratio_binned == "Medium") %>% nrow()))
[1] "Medium HMW: 1"
print(paste0("Total Flat-CNA: ", sample_annotation_CNAFlat %>% nrow()))
[1] "Total Flat-CNA: 10"
print("Samples with concordant CNV profiles")
[1] "Samples with concordant CNV profiles"
sample_annotation_CNACNA <- sample_annotation %>% filter((cfRRBS_sWGS_CNA == "CNA-CNA") | (cfRRBS_sWGS_CNA == "Flat-Flat") )
sample_annotation %>% filter((cfRRBS_sWGS_CNA == "CNA-CNA")) %>% select(CNV_pearson) %>% summary
CNV_pearson
Min. :0.1300
1st Qu.:0.8175
Median :0.8750
Mean :0.8228
3rd Qu.:0.9400
Max. :0.9800
print(paste0("Low HMW: ", sample_annotation_CNACNA %>% filter(cfDNA_HMW_ratio_binned == "Low") %>% nrow()))
[1] "Low HMW: 33"
print(paste0("High HMW: ", sample_annotation_CNACNA %>% filter(cfDNA_HMW_ratio_binned == "High") %>% nrow()))
[1] "High HMW: 4"
print(paste0("Medium HMW: ", sample_annotation_CNACNA %>% filter(cfDNA_HMW_ratio_binned == "Medium") %>% nrow()))
[1] "Medium HMW: 13"
print(paste0("Total CNA-CNA or Flat-Flat: ", sample_annotation_CNACNA %>% nrow()))
[1] "Total CNA-CNA or Flat-Flat: 50"
print("Summary of correctly classified tumors:")
[1] "Summary of correctly classified tumors:"
sample_annotation %>% filter(disease_extent == "Metastatic") %>% select(Estimated_TFx) %>% summary
Estimated_TFx
Min. :0.0000
1st Qu.:0.0300
Median :0.3090
Mean :0.3345
3rd Qu.:0.5450
Max. :1.0000
datatable(describe(sample_annotation_CNACNA, omit = TRUE, quant = c(.25,.75)), rownames = TRUE, filter="top", options = list(pageLength = 5, scrollX=T), caption = "Descriptive statistics of concordant samples (either CNA-CNA or flat-flat)" )ggplot(sample_annotation, aes(y = Estimated_TFx*100, x = IchorCNA_TFx*100, col = cfDNA_HMW_ratio_binned, fill = cfDNA_HMW_ratio_binned)) +
geom_smooth(method="lm", se = TRUE)+ geom_point(color = "black", size = 2) + geom_point(size = 1.5) + theme_point +
geom_abline(slope=1, intercept=0, color = "gray", linetype = "dashed") +
labs(x = "CNV-based eTFx (%)", y = "Methylation-based eTFx (%)", col = "HMW contamination", shape = "HMW contamination", fill = "HMW contamination", title = "Correlation between IchorCNA and cfRRBS-\nbased tumor fraction estimation") + ylim(NA, 100) + xlim(NA, 100) + stat_cor(method = "spearman", aes(label=paste(..r.label.., ifelse(..p.value..< 0.001, "'p < 0.001**'",
ifelse(..p.value..>=0.001 & ..p.value..<0.05, "'p < 0.05*'", paste("'p='",round(..p.value..,2), sep = "~"))),sep = "~")))+ scale_color_brewer(palette="Greens") + scale_fill_brewer(palette="Greens") #paste0("Number of samples with CNA: ", length(sample_annotation$sWGS_CNA [sample_annotation$sWGS_CNA == "CNA" ]))
ggplot(sample_annotation, aes(x = cfRRBS_sWGS_CNA, y = CNV_pearson, fill = cfDNA_HMW_ratio_binned, size = DNA_input_binned)) + scale_size_discrete(range=c(1,4)) +
geom_beeswarm(shape = 21, cex = 3, color = "black", stroke = 0.8) + theme_point +
labs(title = "Correlation between sWGS and cfRRBS-based \n CNV calling", x = "cfRRBS - sWGS copy number aberrations", y = "Pearson R", size = "DNA input (ng)", shape = "HMW contamination", fill = "HMW contamination") + scale_fill_brewer(palette="Greens")ConcordantLowToMed <- sample_annotation %>% filter((cfDNA_HMW_ratio_binned == "Medium" | cfDNA_HMW_ratio_binned == "Low") & (cfRRBS_sWGS_CNA == "CNA-CNA" | cfRRBS_sWGS_CNA =="Flat-Flat" )) %>% nrow()
ConcordantHigh <- sample_annotation %>% filter((cfDNA_HMW_ratio_binned == "High") & (cfRRBS_sWGS_CNA == "CNA-CNA" | cfRRBS_sWGS_CNA =="Flat-Flat" )) %>% nrow()
DiscordantLowToMed <- sample_annotation %>% filter((cfDNA_HMW_ratio_binned == "Medium" | cfDNA_HMW_ratio_binned == "Low") & (cfRRBS_sWGS_CNA == "Flat-CNA")) %>% nrow()
DiscordantHigh <- sample_annotation %>% filter((cfDNA_HMW_ratio_binned == "High") & (cfRRBS_sWGS_CNA == "Flat-CNA")) %>% nrow()
FisherTest_df <- data.frame(row.names = c("Concordant", "Discordant"), LowToMed = c(ConcordantLowToMed,DiscordantLowToMed), High = c(ConcordantHigh, DiscordantHigh))
chisq.test(FisherTest_df)##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: FisherTest_df
## X-squared = 8.4706, df = 1, p-value = 0.003609
resids_result <- as.data.frame(read_csv("https://www.dropbox.com/s/g9d3bb1tm2la95l/methatlas_deconv_output_extracranial_residuals.csv?dl=1"))
datatable(resids_result, rownames = TRUE, filter="top", options = list(pageLength = 5, scrollX=T), caption = "Residuals of NNLS")rownames(resids_result) <- resids_result[,1]
#resids_result <- resids_result[,-1]
rownames(resids_result) <- sub("-.*", "", rownames(resids_result))
rownames(resids_result) <- sub("snake*", "", rownames(resids_result))
#row.names(resids_result) <- resids_result[,1]
#resids_result <- resids_result[,-1]
tmp <- rownames(resids_result)[(rownames(resids_result) %in% sample_annotation$SampleID)]
resids_result <- resids_result[tmp,]
resids_result$SampleID <- rownames(resids_result)
resids_result$X1 <- NULL
resids_result <- merge(resids_result, sample_annotation, by.x = "SampleID", by.y = "SampleID")
#print(wmwTest(resids_result$Residuals~resids_result$CorrectClassification))
ggplot(resids_result, aes(x = reorder(CorrectClassification, desc(CorrectClassification)), y = Residuals, col = ClassificationDetail)) +
geom_beeswarm(color = "black", cex = 2, size = 2) + geom_beeswarm(cex = 2, size = 1.5) + theme_point + stat_compare_means(comparisons = list(c("Correct", "Incorrect")), tip.length = 0,bracket.size = 0.5) +
labs(y = "Residuals", x = "Classfication", col = "Classification call", title = "Residuals of misclassified samples are lower")p <- ggplot(resids_result, aes(sample = scale(Residuals))) + stat_qq() + stat_qq_line() + labs(title = "QQ-plot of residuals")
pp <- ggplot(resids_result, aes(x = scale(Residuals))) + geom_histogram(bins = 20) + labs(title = "Histogram of residuals")
preproducibility <- as.data.frame(read_csv("https://www.dropbox.com/s/0ernrimdmtquiu9/reproducibility_matrix.csv?dl=1", na = c("NA")))
row.names(reproducibility) <- reproducibility[,1]
reproducibility <- reproducibility[,-1]
reproducibility$`Read cutoff` <- gsub("rco", "", reproducibility$`Read cutoff`)
reproducibility$`Read cutoff.ordered` <- factor(reproducibility$`Read cutoff`, levels=c("1", "30", "100"))
ggplot(reproducibility, aes(y=bin, x=count, col = `Read cutoff.ordered`)) + geom_line() + labs( y = "% of samples (n = 60)", x = "% of regions recurrently covered (n = 14095)", col = "read cutoff") + theme_point + scale_y_continuous(breaks=seq(0,100,20)) + scale_x_continuous(breaks=seq(0,100,20))ggplot(HMW_Kapa, aes(x = ProSize, y = cfDNA_HMW_ratio, label = SampleID)) +
geom_abline(slope=1, intercept=0, color = "gray", linetype = "dashed") + geom_point(color = "black", size = 2) + geom_point(size = 1.5) + theme_point +
labs(x = "Observer 1 (CVP)", y = "Observer 2 (RVP)", col = "HMW contamination", shape = "HMW contamination", fill = "HMW contamination", title = "Interrater variability between observer 1 and \n observer 2 cfDNA-HMW ratio") + ylim(NA, 130) + xlim(NA, 130) + stat_cor(method = "pearson", aes(label=paste(..rr.label.., cut(..p..,breaks = c(-Inf, 0.0001, 0.001, 0.01, 0.05, Inf),labels = c("'****'", "'***'", "'**'", "'*'", "'ns'")),sep = "~")))## Cohen's Kappa for 2 Raters (Weights: equal)
##
## Subjects = 51
## Raters = 2
## Kappa = 0.798
##
## z = 7.77
## p-value = 0.00000000000000777
See code on GitHub for more information about preprocessing.
# -----
# lib
# -----
suppressMessages(library("data.table"))
suppressMessages(library("RColorBrewer"))
color.A = colorRampPalette(brewer.pal(4,"BrBG"))(4)[4]
color.B = colorRampPalette(brewer.pal(4,"BrBG"))(4)[2]
color.C = colorRampPalette(brewer.pal(4,"BrBG"))(4)[1]
# -----
# overall fun
# -----
collapse <- function(ratio, chr.lst, collapse.factor=100, merge.factor = 1){
current.n <- 1
ratio.collapsed <- rep(NA, length(ratio))
for (chr in unique(chr.lst)){
n <- length(which(chr.lst == chr))
for (i in (current.n:(current.n + n - 1))){
start = i - collapse.factor / 2
end = i + collapse.factor / 2
if (start < current.n){
start = current.n
}
if (end > current.n + n - 1){
end = current.n + n - 1
}
if (length(which(is.na(ratio[start:end]))) > collapse.factor / 2){ # more than 50% NA -> skip
ratio.collapsed[i] = NA
} else {
ratio.collapsed[i] = mean(ratio[start:end], na.rm = T)
}
i = i + 1
}
ratio.collapsed[current.n] <- NA # start with NA (split chromosomes)
current.n <- current.n + n
}
if (merge.factor > 1){
ratio.collapsed <- cbind(ratio.collapsed[seq(1,length(ratio.collapsed),merge.factor)],
ratio.collapsed[seq(1,length(ratio.collapsed),merge.factor) + 1])
ratio.collapsed <- rowMeans(ratio.collapsed)
}
return(ratio.collapsed)
}
# -----
# load
# -----
sample.annot <- read.csv(annot.file, sep = '\t', header = T, stringsAsFactors = F)
profiles.bins <- list()
profiles.segs <- list()
for (sample in c(sample.annot$base1, sample.annot$base2)){
#print(basename(sample))
bin <- fread(paste0(sample, '_bins.bed'), sep = '\t', header = T)
bin <- bin[bin$chr %in% as.character(1:22)]
seg <- fread(paste0(sample, '_segments.bed'), sep = '\t', header = T)
seg <- seg[seg$chr %in% as.character(1:22)]
bin$ratio.seg <- NA
bin$zscore.seg <- NA
binsize <- bin$end[1] - bin$start[1] + 1
for (i in 1:nrow(seg)){
s <- which(bin$chr == seg$chr[i] & bin$start == seg$start[i])
e <- which(bin$chr == seg$chr[i] & bin$end == seg$end[i])
bin$ratio.seg[s:e] <- seg$ratio[i]
bin$zscore.seg[s:e] <- seg$zscore[i]
}
bin$ratio.col <- collapse(bin$ratio, bin$chr, collapse.factor = 10000000/binsize, merge.factor = 400000 / binsize)
profiles.bins[[sample]] <- bin
profiles.segs[[sample]] <- seg
}
# -----
# main
# -----
# Profile concordance
l.matrix <- matrix(rep(1, 8*3), 3, 8, byrow=T)
l.matrix[2, ] <- 2
l.matrix[3, ] <- 3
l.matrix[3, 7:8] <- 4
for (pt in sample.annot$out.id){
print(pt)
s1 <- sample.annot$base1[sample.annot$out.id == pt]
s2 <- sample.annot$base2[sample.annot$out.id == pt]
#png(paste0(plots.dir, '/', pt, ".png"), width=8,height=4,units="in",res=1024)
par(mar=c(2,2,1,0), mgp=c(0,.4,.2), xpd=NA, oma=c(1,2,1,0))
layout(l.matrix)
b1 <- boxplot.stats(profiles.bins[[s1]]$ratio)
b2 <- boxplot.stats(profiles.bins[[s2]]$ratio)
ylim = c(min(b1$stats[c(1,5)], b2$stats[c(1,5)]), max(b1$stats[c(1,5)], b2$stats[c(1,5)])) * 2.5
plot(profiles.bins[[s1]]$ratio, ylim = ylim, axes=F, ylab='', xlab='', pch=16, col=color.A, cex=.8)
par(new=T)
plot(profiles.bins[[s1]]$ratio.seg, ylim = ylim, axes=F, ylab='', xlab='', pch=16, col=rgb(0.9,0.9,0.9),cex=.2)
chr.lengths <- table(profiles.bins[[s1]]$chr)[match(unique(profiles.bins[[s1]]$chr), names(table(profiles.bins[[s1]]$chr)))]
chr.ends <- c(1, cumsum(chr.lengths))
chr.mids <- chr.ends[2:length(chr.ends)] - chr.lengths / 2
segments(chr.ends[length(chr.ends)] * -.02, 0, chr.ends[length(chr.ends)] * 1.02, 0, lty = 3, lwd = 1)
text(-nrow(profiles.bins[[s1]]) * (-0.05), ylim[2] * 1.4, paste0(pt, " - ", label1), col=color.A, cex=1.3, font=2)
axis(2, las=1, tcl=0.5)
mtext(expression('log'[2]*'(ratio)'),2,2, cex=.8)
segments(chr.ends, ylim[1], chr.ends, ylim[2], lty = 3)
plot(profiles.bins[[s2]]$ratio, ylim = ylim, axes=F, ylab='', xlab='', pch=16, col=color.B, cex=.8)
par(new=T)
plot(profiles.bins[[s2]]$ratio.seg, ylim = ylim, axes=F, ylab='', xlab='', pch=16, col=rgb(0.9,0.9,0.9),cex=.2)
chr.lengths <- table(profiles.bins[[s2]]$chr)[match(unique(profiles.bins[[s2]]$chr), names(table(profiles.bins[[s2]]$chr)))]
chr.ends <- c(1, cumsum(chr.lengths))
chr.mids <- chr.ends[2:length(chr.ends)] - chr.lengths / 2
segments(chr.ends[length(chr.ends)] * -.02, 0, chr.ends[length(chr.ends)] * 1.02, 0, lty = 3, lwd = 1)
text(-nrow(profiles.bins[[s2]]) * (-0.05), ylim[2] * 1.4, paste0(pt, " - ", label2," "), col=color.B, cex=1.3, font=2)
axis(2, las=1, tcl=0.5)
mtext(expression('log'[2]*'(ratio)'),2,2, cex=.8)
segments(chr.ends, ylim[1], chr.ends, ylim[2], lty = 3)
text(chr.mids, ylim[1] - sum(abs(ylim)) * .25, labels=paste0('chr', unique(profiles.bins[[s1]]$chr)), srt=45, cex = .8)
chr.lengths <- table(profiles.bins[[s1]]$chr)[match(unique(profiles.bins[[s1]]$chr), names(table(profiles.bins[[s1]]$chr)))]
chr.ends <- c(1, cumsum(chr.lengths))
chr.mids <- chr.ends[2:length(chr.ends)] - chr.lengths / 2
s2.ratio.col <- profiles.bins[[s2]]$ratio.col[1:nrow(profiles.bins[[s1]])]
plot(profiles.bins[[s1]]$ratio.col, ylim = ylim, axes=F, ylab='', xlab='', pch=16, type = 'l', col=color.A, lwd = 2)
par(new=T)
plot(s2.ratio.col, ylim = ylim, axes=F, ylab='', xlab='', pch=16, type = 'l', col=color.B, lwd = 2)
segments(chr.ends[length(chr.ends)] * -.02, 0, chr.ends[length(chr.ends)] * 1.02, 0, lty = 3, lwd = 1)
axis(2, las=1, tcl=0.5)
mtext(expression('log'[2]*'(ratio)'),2,2, cex=.8)
segments(chr.ends, ylim[1], chr.ends, ylim[2], lty = 3)
text(chr.mids, ylim[1] - sum(abs(ylim)) * .15, labels=unique(profiles.bins[[s2]]$chr), srt=45, cex = .7)
mtext(expression('Chromosomes'), 1, 1.7, cex=.8)
par(mar=c(2,2+1,1,0+2), xpd=F)
ylim <- ylim * .5
trend.1 <- profiles.bins[[s1]]$ratio.col
trend.2 <- s2.ratio.col
plot(trend.1, trend.2, axes=F, ylab='', xlab='', ylim=ylim, xlim=ylim, pch=16, cex=.5)
axis(2, las=1, tcl=0.5)
axis(1, las=1, tcl=0.5)
mtext(expression('log'[2]*'(ratio)'), 1, 2, col = color.A, cex=.8)
mtext(expression('log'[2]*'(ratio)'), 2, 2, col = color.B, cex=.8)
mask <- !(is.na(trend.1) | is.na(trend.2))
p = cor(trend.1[mask], trend.2[mask])
c = coef(lm(trend.2[mask] ~ trend.1[mask]))
int = c[1] ; beta = c[2]
segments(ylim[1], int + ylim[1] * beta, ylim[2], int + ylim[2] * beta, lty=3, lwd=1.5,
col=color.C)
v <- prcomp(cbind(trend.1[mask], trend.2[mask]))$rotation # x, y
beta <- v[2,1]/v[1,1]
int <- mean(trend.2[mask]) - mean(trend.1[mask]) * beta # y - x.b
segments(ylim[1], int + ylim[1] * beta, ylim[2], int + ylim[2] * beta, lty=1, lwd=1.5,
col=color.C)
legend('topleft', paste0('r=', round(p, 2)), bty='n', text.col = color.C)
#dev.off()
}## [1] "Patient_01 - RMS"
## [1] "Patient_02 - RMS"
## [1] "Patient_04 - RMS"
## [1] "Patient_05 - NBL"
## [1] "Patient_06 - NBL"
## [1] "Patient_07 - NBL"
## [1] "Patient_08 - NBL"
## [1] "Patient_10 - NBL"
## [1] "Patient_11 - NBL"
## [1] "Patient_12 - NBL"
## [1] "Patient_13 - NBL"
## [1] "Patient_14 - RMS"
## [1] "Patient_16 - WT"
## [1] "Patient_17 - WT"
## [1] "Patient_18 - ATRT"
## [1] "Patient_19 - aRMS"
## [1] "Patient_20 - MB"
## [1] "Patient_21 - OS"
## [1] "Patient_22 - EWS"
## [1] "Patient_23 - OS"
## [1] "Patient_24 - OS"
## [1] "Patient_25 - WT"
## [1] "Patient_27 - WT"
## [1] "Patient_28 - WT"
## [1] "Patient_30 - WT"
## [1] "Patient_31 - WT"
## [1] "Patient_32 - WT"
## [1] "Patient_33 - MRTK"
## [1] "Patient_34 - OS"
## [1] "Patient_35 - MB"
## [1] "Patient_36 - CCSK_T1"
## [1] "Patient_36 - CCSK_T2"
## [1] "Patient_37 - EWS"
## [1] "Patient_38 - EWS"
## [1] "Patient_39 - EWS"
## [1] "Patient_40 - EWS"
## [1] "Patient_43 - aRMS"
## [1] "Patient_44 - RMS"
## [1] "Patient_45 - RMS"
## [1] "Patient_46 - RMS"
## [1] "Patient_47 - RMS"
## [1] "Patient_48 - RMS"
## [1] "Patient_50 - WT"
## [1] "Patient_51 - WT"
## [1] "Patient_52 - WT"
## [1] "Patient_53 - WT"
## [1] "Patient_54 - WT"
## [1] "Patient_55 - WT"
## [1] "Patient_56 - WT"
## [1] "Patient_57 - RMS"
## [1] "Patient_58 - RMS"
## [1] "Patient_59 - RMS"
## [1] "Patient_60 - RMS"
## [1] "Patient_61 - RMS"
## [1] "Patient_62 - RMS"
## [1] "Patient_63 - WT"
## [1] "Patient_64 - NBL"
## [1] "Patient_65 - NBL"
## [1] "Patient_66 - EWS"
## [1] "Patient_67 - MB"
## [1] "Patient_67 - MB Streck*"
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS 10.14.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] grid stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] data.table_1.12.0 bindrcpp_0.2.2 DT_0.5
## [4] asht_0.9.4 coin_1.3-0 survival_2.43-3
## [7] bpcp_1.3.4 exact2x2_1.6.3 exactci_1.3-3
## [10] ssanv_1.1 irr_0.84.1 lpSolve_5.6.13
## [13] ggbeeswarm_0.6.0 psych_1.8.12 httr_1.3.1
## [16] readxl_1.2.0 ggpubr_0.2 magrittr_1.5
## [19] plotly_4.8.0 RCurl_1.95-4.11 bitops_1.0-6
## [22] ggExtra_0.8 gridExtra_2.3 ggrepel_0.8.0
## [25] scales_1.0.0 RColorBrewer_1.1-2 broom_0.5.1
## [28] reshape2_1.4.3 forcats_0.3.0 stringr_1.3.1
## [31] dplyr_0.7.7 purrr_0.2.5 readr_1.1.1
## [34] tidyr_0.8.1 tibble_2.1.1 ggplot2_3.0.0
## [37] tidyverse_1.2.1
##
## loaded via a namespace (and not attached):
## [1] nlme_3.1-137 matrixStats_0.54.0 lubridate_1.7.4
## [4] rprojroot_1.3-2 tools_3.5.1 backports_1.1.4
## [7] R6_2.2.2 vipor_0.4.5 lazyeval_0.2.1
## [10] colorspace_1.3-2 withr_2.1.2 tidyselect_0.2.5
## [13] mnormt_1.5-5 curl_3.2 compiler_3.5.1
## [16] cli_1.1.0 rvest_0.3.2 xml2_1.2.0
## [19] sandwich_2.5-0 labeling_0.3 mvtnorm_1.0-11
## [22] digest_0.6.20 foreign_0.8-71 rmarkdown_1.10
## [25] pkgconfig_2.0.2 htmltools_0.3.6 htmlwidgets_1.3
## [28] rlang_0.4.0 rstudioapi_0.7 shiny_1.2.0
## [31] bindr_0.1.1 generics_0.0.2 zoo_1.8-4
## [34] jsonlite_1.6 crosstalk_1.0.0 modeltools_0.2-22
## [37] Matrix_1.2-14 Rcpp_1.0.2 munsell_0.5.0
## [40] multcomp_1.4-10 stringi_1.2.4 yaml_2.2.0
## [43] MASS_7.3-50 plyr_1.8.4 parallel_3.5.1
## [46] promises_1.0.1 crayon_1.3.4 miniUI_0.1.1.1
## [49] lattice_0.20-35 haven_2.1.0 splines_3.5.1
## [52] hms_0.4.2 knitr_1.20 pillar_1.4.2
## [55] ggsignif_0.5.0 codetools_0.2-15 stats4_3.5.1
## [58] glue_1.3.1 evaluate_0.14 modelr_0.1.2
## [61] httpuv_1.5.0 cellranger_1.1.0 gtable_0.2.0
## [64] assertthat_0.2.1 mime_0.5 libcoin_1.0-4
## [67] xtable_1.8-3 perm_1.0-0.0 later_0.8.0
## [70] viridisLite_0.3.0 beeswarm_0.2.3 TH.data_1.0-10